EDA on Diabetic Data, 13 Classifires and 13 Regreser Models for Classifiction and Prediction¶

Developed by Mr.Sachin M, Data Science @IPECSolutions Pvt.Ltd, Bengaluru¶

In [1]:
import pandas as pd
Data = pd.read_csv(r"C:\Users\Dell\Documents\ipec\Datas\diabetes.csv")
Data.head()
Out[1]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [2]:
Data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [3]:
Data.eq(0).sum()  
Out[3]:
Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64
In [4]:
print(f" Number of Rows:{Data.shape[0]}\n Number of Columns:{Data.shape[1]}")
 Number of Rows:768
 Number of Columns:9
In [5]:
Data.columns
Out[5]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
In [6]:
Data.describe()
Out[6]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [7]:
Data['DiabetesPedigreeFunction'] = round(Data['DiabetesPedigreeFunction'])
In [8]:
Data.head()
Out[8]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 1.0 50 1
1 1 85 66 29 0 26.6 0.0 31 0
2 8 183 64 0 0 23.3 1.0 32 1
3 1 89 66 23 94 28.1 0.0 21 0
4 0 137 40 35 168 43.1 2.0 33 1
In [9]:
Data.value_counts()
Out[9]:
Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  BMI   DiabetesPedigreeFunction  Age  Outcome
0            57       60             0              0        21.7  1.0                       67   0          1
             67       76             0              0        45.3  0.0                       46   0          1
5            103      108            37             0        39.2  0.0                       65   0          1
             104      74             0              0        28.8  0.0                       48   0          1
             105      72             29             325      36.9  0.0                       28   0          1
                                                                                                            ..
2            84       50             23             76       30.4  1.0                       21   0          1
             85       65             0              0        39.6  1.0                       27   0          1
             87       0              23             0        28.9  1.0                       25   0          1
                      58             16             52       32.7  0.0                       25   0          1
17           163      72             41             114      40.9  1.0                       47   1          1
Length: 768, dtype: int64
In [10]:
Data['Pregnancies'].value_counts()
Out[10]:
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64
In [11]:
Data['Glucose'].value_counts()
Out[11]:
99     17
100    17
111    14
129    14
125    14
       ..
191     1
177     1
44      1
62      1
190     1
Name: Glucose, Length: 136, dtype: int64
In [12]:
Data['BloodPressure'].value_counts()
Out[12]:
70     57
74     52
78     45
68     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
75      8
92      8
65      7
85      6
94      6
48      5
96      4
44      4
100     3
106     3
98      3
110     3
55      2
108     2
104     2
46      2
30      2
122     1
95      1
102     1
61      1
24      1
38      1
40      1
114     1
Name: BloodPressure, dtype: int64
In [13]:
Data['DiabetesPedigreeFunction'].value_counts()
Out[13]:
0.0    491
1.0    267
2.0     10
Name: DiabetesPedigreeFunction, dtype: int64
In [14]:
Data['Age'].value_counts()
Out[14]:
22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
38    16
36    16
32    16
45    15
34    14
46    13
43    13
40    13
39    12
35    10
50     8
51     8
52     8
44     8
58     7
47     6
54     6
49     5
48     5
57     5
53     5
60     5
66     4
63     4
62     4
55     4
67     3
56     3
59     3
65     3
69     2
61     2
72     1
81     1
64     1
70     1
68     1
Name: Age, dtype: int64
In [15]:
Data['Outcome'].value_counts()
Out[15]:
0    500
1    268
Name: Outcome, dtype: int64
In [16]:
Data.isnull().sum()
Out[16]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
In [17]:
Data[Data.duplicated()]
Out[17]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome

Missing Value¶

In [18]:
import missingno as msno
msno.bar(Data)
Out[18]:
<Axes: >
In [19]:
print(Data.dtypes)
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object
In [20]:
import matplotlib.pyplot as plt
des=Data.describe()
ax = des.plot(kind='barh')

plt.savefig('des.png',bbox_inches='tight')

plots¶

In [21]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')
In [22]:
plot_city=Data['Pregnancies'].value_counts()[0:50].reset_index()
plot_city.columns=['Pregnancies','Count']
px.bar(plot_city,x='Pregnancies',y='Count',template='gridon',
      title='Pregnancies',color='Count')
In [23]:
plot_city=Data['Glucose'].value_counts()[0:50].reset_index()
plot_city.columns=['Glucose','Count']
px.bar(plot_city,x='Glucose',y='Count',template='gridon',
      title='Glucose',color='Count')
In [24]:
plot_city=Data['BloodPressure'].value_counts()[0:50].reset_index()
plot_city.columns=['BloodPressure','Count']
px.bar(plot_city,x='BloodPressure',y='Count',template='gridon',
      title='BloodPressure',color='Count')
In [25]:
plot_city=Data['SkinThickness'].value_counts()[0:50].reset_index()
plot_city.columns=['SkinThickness','Count']
px.bar(plot_city,x='SkinThickness',y='Count',template='gridon',
      title='SkinThickness',color='Count')
In [26]:
plot_city=Data['Insulin'].value_counts()[0:50].reset_index()
plot_city.columns=['Insulin','Count']
px.bar(plot_city,x='Insulin',y='Count',template='gridon',
      title='Insulin',color='Count')
In [27]:
plot_city=Data['BMI'].value_counts()[0:50].reset_index()
plot_city.columns=['BMI','Count']
px.bar(plot_city,x='BMI',y='Count',template='gridon',
      title='BMI',color='Count')
In [28]:
plot_city=Data['DiabetesPedigreeFunction'].value_counts()[0:50].reset_index()
plot_city.columns=['DiabetesPedigreeFunction','Count']
px.bar(plot_city,x='DiabetesPedigreeFunction',y='Count',template='gridon',
      title='DiabetesPedigreeFunction',color='Count')
In [29]:
plot_city=Data['Age'].value_counts()[0:50].reset_index()
plot_city.columns=['Age','Count']
px.bar(plot_city,x='Age',y='Count',template='gridon',
      title='Age',color='Count')
In [30]:
plot_city=Data['Outcome'].value_counts()[0:50].reset_index()
plot_city.columns=['Outcome','Count']
px.bar(plot_city,x='Outcome',y='Count',template='gridon',
      title='Outcome',color='Count')
In [31]:
Data.hist(figsize = (12, 12))
plt.show()
In [32]:
import matplotlib.pyplot as plt
for i in Data.columns:
    Data.boxplot (column=i)
In [33]:
for i in Data.columns:
    Data.boxplot (column='Age')
In [34]:
for i in Data.columns:
    Data.boxplot (column='Pregnancies')
In [35]:
for i in Data.columns:
    Data.boxplot (column='Glucose')
In [36]:
for i in Data.columns:
    Data.boxplot (column='BloodPressure')
In [37]:
for i in Data.columns:
    Data.boxplot (column='SkinThickness')
In [38]:
for i in Data.columns:
    Data.boxplot (column='Insulin')
In [39]:
for i in Data.columns:
    Data.boxplot (column='BMI')
In [40]:
for i in Data.columns:
    Data.boxplot (column='DiabetesPedigreeFunction')
In [41]:
for i in Data.columns:
    Data.boxplot (column='Outcome')
In [42]:
q1 = Data.quantile(0.25)
q3 = Data.quantile(0.75)
iqr = q3 - q1
threshold = 1.5
Data_no_outliers = Data[~((Data < (q1 - threshold * iqr)) |(Data > (q3 + threshold * iqr))).any(axis=1)]
cleadData=Data_no_outliers
In [43]:
cleadData.to_csv('clean1.csv')
In [44]:
Data1 = pd.read_csv('clean1.csv')
Data1.head()
Out[44]:
Unnamed: 0 Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 0 6 148 72 35 0 33.6 1.0 50 1
1 1 1 85 66 29 0 26.6 0.0 31 0
2 2 8 183 64 0 0 23.3 1.0 32 1
3 3 1 89 66 23 94 28.1 0.0 21 0
4 4 0 137 40 35 168 43.1 2.0 33 1
In [45]:
Data1.drop(['Unnamed: 0'], axis=1)
Out[45]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 1.0 50 1
1 1 85 66 29 0 26.6 0.0 31 0
2 8 183 64 0 0 23.3 1.0 32 1
3 1 89 66 23 94 28.1 0.0 21 0
4 0 137 40 35 168 43.1 2.0 33 1
... ... ... ... ... ... ... ... ... ...
659 10 101 76 48 180 32.9 0.0 63 0
660 2 122 70 27 0 36.8 0.0 27 0
661 5 121 72 23 112 26.2 0.0 30 0
662 1 126 60 0 0 30.1 0.0 47 1
663 1 93 70 31 0 30.4 0.0 23 0

664 rows × 9 columns

In [46]:
for i in Data1.columns:
    Data1.boxplot (column='BMI')
In [47]:
sns.barplot(x='Pregnancies',data=Data1)
plt.show()
In [48]:
sns.barplot(x='Glucose',data=Data1)
plt.show()
In [49]:
sns.barplot(x='BloodPressure',data=Data1)
plt.show()
In [50]:
sns.barplot(x='SkinThickness',data=Data1)
plt.show()
In [51]:
sns.barplot(x='Insulin',data=Data1)
plt.show()
In [52]:
sns.barplot(x='BMI',data=Data1)
plt.show()
In [53]:
sns.barplot(x='DiabetesPedigreeFunction',data=Data1)
plt.show()
In [54]:
sns.barplot(x='Age',data=Data1)
plt.show()
In [55]:
sns.barplot(x='Outcome',data=Data1)
plt.show()
In [56]:
sns.barplot(x='Pregnancies',y='Outcome',data=Data)
plt.xticks(rotation=90)
plt.show()

feature of importance¶

In [58]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.ensemble import ExtraTreesClassifier
import warnings
import scipy.stats
warnings.filterwarnings('ignore')
In [59]:
x=Data1[['Glucose', 'BMI', 'Age', 'Pregnancies', 'SkinThickness',
       'Insulin', 'DiabetesPedigreeFunction']]
y=Data1.iloc[:,8]

model = ExtraTreesClassifier()
model.fit(x,y)
print(model.feature_importances_) 
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(20).plot(kind='bar')
plt.show()
[0.12742677 0.13117162 0.43271221 0.10251247 0.09381464 0.07521079
 0.0371515 ]
In [60]:
!pip install imblearn
Requirement already satisfied: imblearn in c:\users\dell\anaconda3\lib\site-packages (0.0)
Requirement already satisfied: imbalanced-learn in c:\users\dell\anaconda3\lib\site-packages (from imblearn) (0.12.3)
Requirement already satisfied: numpy>=1.17.3 in c:\users\dell\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.24.4)
Requirement already satisfied: scipy>=1.5.0 in c:\users\dell\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.10.1)
Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\dell\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.4.2)
Requirement already satisfied: joblib>=1.1.1 in c:\users\dell\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (1.3.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\dell\anaconda3\lib\site-packages (from imbalanced-learn->imblearn) (2.2.0)
[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
In [60]:
Data1.columns
Out[60]:
Index(['Unnamed: 0', 'Pregnancies', 'Glucose', 'BloodPressure',
       'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
       'Outcome'],
      dtype='object')
In [61]:
X = Data1[['Pregnancies', 'Glucose', 'BloodPressure','SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
y = Data1[['Outcome']]
In [62]:
y
Out[62]:
Outcome
0 1
1 0
2 1
3 0
4 1
... ...
659 0
660 0
661 0
662 1
663 0

664 rows × 1 columns

In [63]:
X
Out[63]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148 72 35 0 33.6 1.0 50
1 1 85 66 29 0 26.6 0.0 31
2 8 183 64 0 0 23.3 1.0 32
3 1 89 66 23 94 28.1 0.0 21
4 0 137 40 35 168 43.1 2.0 33
... ... ... ... ... ... ... ... ...
659 10 101 76 48 180 32.9 0.0 63
660 2 122 70 27 0 36.8 0.0 27
661 5 121 72 23 112 26.2 0.0 30
662 1 126 60 0 0 30.1 0.0 47
663 1 93 70 31 0 30.4 0.0 23

664 rows × 8 columns

In [64]:
import missingno as msno
msno.bar(Data1)
Out[64]:
<Axes: >

Before Handling Imbalance Nature of Data¶

In [65]:
plt.figure(figsize=(6,4))
sns.countplot(x='Outcome', data=Data1)
plt.title('Countplot of Outcome ')
plt.show()
In [66]:
import imblearn
In [67]:
import sklearn
import imblearn

print(sklearn.__version__)
print(imblearn.__version__)
1.4.2
0.12.3
In [68]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 
from collections import Counter
In [69]:
X = Data1[['Pregnancies', 'Glucose', 'BloodPressure','SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
y = Data1[['Outcome']]
In [70]:
print('Original dataset shape %s' ,(X.shape,y.shape))

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
print('Resampled dataset shape %s', (X_res.shape, y_res.shape))
Original dataset shape %s ((664, 8), (664, 1))
Resampled dataset shape %s ((898, 8), (898, 1))
In [71]:
sns.kdeplot(data=Data1, x='Age', bw_method=.15, hue='Outcome')
Out[71]:
<Axes: xlabel='Age', ylabel='Density'>
In [72]:
# 
In [73]:
sns.kdeplot(data=Data1, x='BMI', bw_method=.15, hue='Outcome')
Out[73]:
<Axes: xlabel='BMI', ylabel='Density'>
In [74]:
Data1['Outcome'].value_counts()
Out[74]:
0    449
1    215
Name: Outcome, dtype: int64
In [75]:
plt.pie([449,215], labels = ['Non-Diabetic', 'Diabetic'], colors = ['hotpink', 'navy'])
plt.axis('equal')
plt.title('Number of Diabetic and Non-Diabetic Patients');
In [ ]:
 
In [76]:
sns.distplot(Data1.Age)
Out[76]:
<Axes: xlabel='Age', ylabel='Density'>
In [77]:
sns.distplot(Data1.Outcome)
Out[77]:
<Axes: xlabel='Outcome', ylabel='Density'>

Gaussian Naive Bayes¶

In [78]:
#X = Data1[['Pregnancies', 'Glucose', 'BloodPressure','SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]
#y = Data1[['Outcome']]
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, random_state=0,test_size=0.2)
In [79]:
print("X Training Samples",x_train.shape[0])
print("Y Training Samples",y_train.shape[0])
print("X Test Samples",x_test.shape[0])
print("Y Test Samples",y_test.shape[0])
X Training Samples 718
Y Training Samples 718
X Test Samples 180
Y Test Samples 180
In [80]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)
print('Accuracy of GBN classifier on training set: {:.2f}%'
    .format(gnb.score(x_train, y_train)*100))
print('Accuracy of GBN classifier on test set: {:.2f}%'
      .format(gnb.score(x_test, y_test)*100))
Accuracy of GBN classifier on training set: 73.26%
Accuracy of GBN classifier on test set: 75.00%
In [81]:
gnbTrain=gnb.score(x_train, y_train)*100
gnbTest=gnb.score(x_test, y_test)*100
In [82]:
gnbTrain
Out[82]:
73.25905292479109
In [83]:
from sklearn.metrics import classification_report
pred = gnb.predict(x_test)

print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.73      0.77      0.75        88
           1       0.77      0.73      0.75        92

    accuracy                           0.75       180
   macro avg       0.75      0.75      0.75       180
weighted avg       0.75      0.75      0.75       180

In [84]:
pred = gnb.predict(x_test)
In [85]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)
cm
Out[85]:
array([[68, 20],
       [25, 67]], dtype=int64)
In [86]:
import seaborn as sns
import numpy as np
#print(cm)
sns.heatmap(cm/np.sum(cm), annot=True, fmt='.2%', cmap='Greens')
#plt.savefig('confusion.png')
plt.show()

Logistic Regression¶

In [87]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e5)
logreg.fit(x_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(x_train, y_train)*100))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(x_test, y_test)*100))
Accuracy of Logistic regression classifier on training set: 71.03
Accuracy of Logistic regression classifier on test set: 74.44
In [88]:
LGTrain=logreg.score(x_train, y_train)*100
LGTest=logreg.score(x_test, y_test)*100
In [89]:
LGTest
Out[89]:
74.44444444444444
In [90]:
pred = logreg.predict(x_test)
In [91]:
LogM = confusion_matrix(y_test, pred)
sns.heatmap(LogM/np.sum(LogM), annot=True, fmt='.2%', cmap='Greens')
plt.show()

Confusion Matrix¶

In [92]:
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#sns.heatmap(LogM, annot=True)
disp = ConfusionMatrixDisplay(confusion_matrix=LogM)
disp.plot()

plt.show()
In [93]:
new_sample = [[ 7, 144, 69, 34, 33.8, 1.3, 40, 0]]
new_sample_Data1 = pd.DataFrame(new_sample, columns=x_train.columns)

y_pred = gnb.predict(new_sample_Data1)
print("Predicted new sample:", y_pred[0],"1st Group")
Predicted new sample: 0 1st Group

Linear Discriminant Analysis¶

In [94]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lad = LinearDiscriminantAnalysis()
lad.fit(x_train, y_train)
print('Accuracy of LAD classifier on training set: {:.2f}'
     .format(lad.score(x_train, y_train)))
print('Accuracy of LAD classifier on test set: {:.2f}'
     .format(lad.score(x_test, y_test)))
Accuracy of LAD classifier on training set: 0.74
Accuracy of LAD classifier on test set: 0.74
In [95]:
LDATrain=lad.score(x_train, y_train)*100
LDATest=lad.score(x_test, y_test)*100
In [96]:
LDATest
Out[96]:
74.44444444444444
In [97]:
pred = lad.predict(x_test)

print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.72      0.77      0.75        88
           1       0.77      0.72      0.74        92

    accuracy                           0.74       180
   macro avg       0.75      0.75      0.74       180
weighted avg       0.75      0.74      0.74       180

In [98]:
predLDA = lad.predict(x_test)
In [99]:
LDAcm = confusion_matrix(y_test, predLDA)
sns.heatmap(LDAcm/np.sum(LDAcm), annot=True, fmt='.2%', cmap='Greens')
Out[99]:
<Axes: >
In [100]:
LDAcm = confusion_matrix(y_test, predLDA)
sns.heatmap(LDAcm,annot=True)
disp = ConfusionMatrixDisplay(confusion_matrix=LDAcm)
disp.plot()

plt.show()

Support Vector Machine¶

In [101]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(x_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(x_train, y_train)*100))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(x_test, y_test)*100))
Accuracy of SVM classifier on training set: 72.42
Accuracy of SVM classifier on test set: 72.78
In [102]:
SVMTrain=svm.score(x_train, y_train)*100
SVMTest=svm.score(x_test, y_test)*100
In [103]:
SVMTest
Out[103]:
72.77777777777777
In [104]:
pred = svm.predict(x_test)

print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.72      0.72      0.72        88
           1       0.73      0.74      0.74        92

    accuracy                           0.73       180
   macro avg       0.73      0.73      0.73       180
weighted avg       0.73      0.73      0.73       180

In [105]:
pred = svm.predict(x_test)
In [106]:
SVM = confusion_matrix(y_test, pred)
sns.heatmap(SVM /np.sum(SVM ), annot=True, fmt='.2%', cmap='Greens')
plt.show()
In [107]:
#from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#sns.heatmap(LogM, annot=True)
disp = ConfusionMatrixDisplay(confusion_matrix=SVM)
disp.plot()

plt.show()

DecisionTree¶

In [108]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1)

print("Training samples:", Xtrain.shape)
print("Testing samples:", Xtest.shape)
Training samples: (531, 7)
Testing samples: (133, 7)
In [109]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf =clf.fit(Xtrain, ytrain)
y_pred = clf.predict(Xtest)
In [110]:
from sklearn import metrics
print("Accuracy:{0}%".format(metrics.accuracy_score(ytest,y_pred)*100))
Accuracy:69.17293233082707%
In [111]:
DTTest=metrics.accuracy_score(ytest,y_pred)*100
In [112]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[68 19]
 [22 24]]
Out[112]:
<Axes: >
In [113]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
              precision    recall  f1-score   support

           0       0.76      0.78      0.77        87
           1       0.56      0.52      0.54        46

    accuracy                           0.69       133
   macro avg       0.66      0.65      0.65       133
weighted avg       0.69      0.69      0.69       133

Logistic Regression¶

In [114]:
# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)*100
print(f'Accuracy: {accuracy}')
Accuracy: 67.66917293233082
In [115]:
LRTEST=accuracy_score(y_test, y_pred)*100
In [116]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[61 26]
 [31 15]]
Out[116]:
<Axes: >
In [117]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
              precision    recall  f1-score   support

           0       0.66      0.70      0.68        87
           1       0.37      0.33      0.34        46

    accuracy                           0.57       133
   macro avg       0.51      0.51      0.51       133
weighted avg       0.56      0.57      0.57       133

Random Forest¶

In [118]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)*100
print(f'Accuracy: {accuracy}')
Accuracy: 72.93233082706767
In [119]:
RFTest=accuracy_score(y_test, y_pred)*100
In [120]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[62 25]
 [35 11]]
Out[120]:
<Axes: >
In [121]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
              precision    recall  f1-score   support

           0       0.64      0.71      0.67        87
           1       0.31      0.24      0.27        46

    accuracy                           0.55       133
   macro avg       0.47      0.48      0.47       133
weighted avg       0.52      0.55      0.53       133

XGBoost¶

In [122]:
#! pip install xgboost
In [123]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize the model
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)*100
print(f'Accuracy: {accuracy}')
Accuracy: 70.67669172932331
In [124]:
XGBTest=accuracy_score(y_test, y_pred)*100
In [125]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[62 25]
 [30 16]]
Out[125]:
<Axes: >
In [126]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
              precision    recall  f1-score   support

           0       0.67      0.71      0.69        87
           1       0.39      0.35      0.37        46

    accuracy                           0.59       133
   macro avg       0.53      0.53      0.53       133
weighted avg       0.58      0.59      0.58       133

Random Forest regression¶

In [127]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Random Forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Calculate the Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')
Mean Squared Error: 0.20292631578947368
Root Mean Squared Error: 0.4504734351651312

K-Nearest Neighbors (KNN)¶

In [129]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.7218045112781954
In [130]:
KNNTest=accuracy_score(y_test, y_pred)*100
In [131]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[64 23]
 [34 12]]
Out[131]:
<Axes: >
In [132]:
from sklearn.metrics import classification_report
print(classification_report(ytest, y_pred))
              precision    recall  f1-score   support

           0       0.65      0.74      0.69        87
           1       0.34      0.26      0.30        46

    accuracy                           0.57       133
   macro avg       0.50      0.50      0.49       133
weighted avg       0.55      0.57      0.56       133

Artificial Neural Networks¶

In [133]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One-hot encode the labels
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

# Initialize the model
model = Sequential()

# Add input layer and first hidden layer
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))

# Add second hidden layer
model.add(Dense(32, activation='relu'))

# Add output layer
model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')
Epoch 1/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 2s 29ms/step - accuracy: 0.3489 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 2/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3281 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 3/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.3183 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 4/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.2999 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 5/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3485 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 6/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3471 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 7/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.2812 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 8/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3073 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 9/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.3597 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 10/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 12ms/step - accuracy: 0.3158 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 11/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3246 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 12/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3202 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 13/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2964 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 14/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3250 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 15/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.3070 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 16/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3193 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 17/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3271 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 18/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.3151 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 19/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.3026 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 20/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3377 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 21/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.3198 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 22/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3191 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 23/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3318 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 24/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3241 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 25/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.2846 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 26/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.2628 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 27/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3016 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 28/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.3318 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 29/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3285 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 30/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3362 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 31/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3159 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 32/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.3386 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 33/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3902 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 34/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3210 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 35/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3095 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 36/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step - accuracy: 0.3018 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 37/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.3006 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 38/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.3284 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 39/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3135 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 40/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3154 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 41/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3351 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 42/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3153 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 43/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3401 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 44/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3285 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 45/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3156 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 46/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.3064 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 47/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.3274 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 48/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.3690 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 49/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 9ms/step - accuracy: 0.3266 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
Epoch 50/50
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.3327 - loss: 0.0000e+00 - val_accuracy: 0.3084 - val_loss: 0.0000e+00
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3390 - loss: 0.0000e+00 
Test Accuracy: 0.3308270573616028
In [134]:
ANNTest=model.evaluate(X_test, y_test)*100
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - accuracy: 0.3390 - loss: 0.0000e+00 
In [135]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')
[[64 23]
 [34 12]]
Out[135]:
<Axes: >

Extra Trees¶

In [136]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Extra Trees classifier
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Fit the model
etc.fit(X_train, y_train)

# Make predictions on the test set
y_pred = etc.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.706766917293233
In [140]:
ETTest=accuracy_score(y_test, y_pred)*100
In [137]:
ETTest
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[137], line 1
----> 1 ETTest

NameError: name 'ETTest' is not defined
In [ ]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
cm = confusion_matrix(ytest, y_pred)

print(cm)
sns.heatmap(cm/np.sum(cm), annot=True,fmt='.2%',cmap='Blues')

AdaBoost¶

In [138]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize the model
model = AdaBoostClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.6992481203007519
In [139]:
ADBTest=accuracy_score(y_test, y_pred)*100
ADBTest
Out[139]:
69.92481203007519
In [140]:
#print(ADBTest,ETTest,ANNTest,KNNTest,XGBTest,RFTest,LRTEST,DTTest,SVMTest,LDATest,gnbTest)
In [141]:
gnbTest
Out[141]:
75.0
In [142]:
D ={"ADBTest": 69.92 , "ETTest": 70.67, "ANNTest": 33.08, "KNNTest": 72.18, "XGBTest": 70.67, "RFTest": 72.93, "LRTEST": 67.66, "DTTest" :52.63, "SVMTest" :66.91, "LDATest":66.91, "gnbTest" :55.63}
In [143]:
Data2 = pd.DataFrame(D, index=[0])
print(Data2)
   ADBTest  ETTest  ANNTest  KNNTest  XGBTest  RFTest  LRTEST  DTTest  \
0    69.92   70.67    33.08    72.18    70.67   72.93   67.66   52.63   

   SVMTest  LDATest  gnbTest  
0    66.91    66.91    55.63  
In [144]:
Data2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ADBTest  1 non-null      float64
 1   ETTest   1 non-null      float64
 2   ANNTest  1 non-null      float64
 3   KNNTest  1 non-null      float64
 4   XGBTest  1 non-null      float64
 5   RFTest   1 non-null      float64
 6   LRTEST   1 non-null      float64
 7   DTTest   1 non-null      float64
 8   SVMTest  1 non-null      float64
 9   LDATest  1 non-null      float64
 10  gnbTest  1 non-null      float64
dtypes: float64(11)
memory usage: 96.0 bytes
In [145]:
Data2 = round(Data2)
In [146]:
Data2.head()
Out[146]:
ADBTest ETTest ANNTest KNNTest XGBTest RFTest LRTEST DTTest SVMTest LDATest gnbTest
0 70.0 71.0 33.0 72.0 71.0 73.0 68.0 53.0 67.0 67.0 56.0
In [149]:
ax = Data2.plot.bar(use_index=True, rot=0, color='#1b9e77')

plt.show()
In [ ]: